get_conus_spatial <- function(data, var, proj = 5070, abb = FALSE) {
    # Filter CONUS from a spatial US dataset and modify its projection.
    #
    # Args:
    #   data: Spatial US dataset to filter.
    #   var: Column of "data" that contains states' information.
    #   proj: EPSG code to project dataset to, defaults to EPSG:5070.
    #   abb: If TRUE, then states in "var" are denoted with abbreviations;
    #        If FALSE, then states in "var" are denoted by their full names.
    #
    # Returns:
    #   A spatial CONUS dataset projected to given EPSG code.
    if (abb) {
        conus <- filter(data,
            !(get(var)) %in% c("AK", "HI", "PR", "GU"))
    } else {
        conus <- filter(data,
            !(get(var)) %in% c("Alaska", "Hawaii", "Puerto Rico", "Guam"))
    }

    conus <- sf::st_transform(conus, proj)
    return(conus)
}

generate_tiles <- function(data, tile_type = TRUE) {
    # Generate tiled spatial data from four tessellation/coverage options.
    #
    # Args:
    #   data: Spatial dataset to tesselate
    #   tile_type: "square" - square coverage with "n = 70"
    #              "hexagonal" - hexagonal coverage with "n = 70"
    #              "voronoi" - voronoi tesselation
    #              "triangulate" - Delaunay triangulation
    #              none of the above - unmodified data
    #
    # Returns:
    #   A spatial, tiled dataset from a given dataset and
    #   tessellation/coverage type or unmodified data.

    if (tile_type == "square") {
        tiled_data <- sf::st_make_grid(data, n = 70)
    } else if (tile_type == "hexagonal") {
        tiled_data <- sf::st_make_grid(data, n = 70, square = FALSE)
    } else if (tile_type == "voronoi") {
        tiled_data <- sf::st_centroid(data) %>%
            sf::st_combine() %>%
            sf::st_voronoi() %>%
            sf::st_cast()
    } else if (tile_type == "triangulate") {
        tiled_data <- sf::st_centroid(data) %>%
            sf::st_combine() %>%
            sf::st_triangulate() %>%
            sf::st_cast()
    } else {
        return(data)
    }

    tiled_data <- sf::st_as_sf(tiled_data) %>%
        mutate(id = 1:n())

    return(tiled_data)
}

plot_map <- function(data, title = "", include_features = TRUE) {
    # Plot spatial data using ggplot() + geom_sf().
    #
    # Args:
    #   data: Spatial dataset to plot.
    #   title: Title shown on plot.
    #   include_features: If TRUE, include caption of features collected.
    #
    # Returns:
    #   A plotted ggplot() map with, a white background,
    #   navy border, size = 0.2, features collected, and given title.

    ggplot() +
        geom_sf(
            data = data,
            fill = "white",
            colour = "navy",
            size = 0.2
        ) +
        theme_void() +
        labs(
            title = title,
            caption = if (include_features) {
                paste(nrow(data), " features collected")
            } else {
                ""
            }
        )
}

plot_map_pip <- function(data, title = "", highlight_sd_mean = FALSE) {
    # Plot simple features point-in-polygon data using ggplot() + geom_sf().
    #
    # Args:
    #   data: simple features point-in-polygon dataset to plot.
    #   title: Title shown on plot.
    #   highlight_sd_mean: If TRUE, use gghighlight() to highlight
    #                      the above mean count of points in a polygon.
    #
    # Returns:
    #   A plotted ggplot() map using viridis colour scale.

    ggplot() +
        geom_sf(
            data = data,
            aes(fill = n),
            size = 0.2,
            col = NA
        ) +
        scale_fill_viridis(
            option = "plasma",
            name = "Number of Dams",
            guide = guide_colorbar(
                direction = "horizontal",
                barheight = unit(2, units = "mm"),
                barwidth = unit(50, units = "mm"),
                draw.ulim = F,
                title.position = "top",
                title.hjust = 0.5,
                label.hjust = 0.5
            )
        ) +
        theme_void() +
        theme(legend.position = "bottom") +
        labs(
            title = title,
            caption = paste(sum(data$n), " dams")
        ) + if (highlight_sd_mean) {
            gghighlight(
                    n > mean(data$n) + sd(data$n),
                    unhighlighted_params = list(alpha = 0.5, col = NA, fill = "#d4d4d4")
            )
        }
}

compute_features <- function(sf_data, description) {
    # Compute calculations on a "sf" object.
    #
    # Args:
    #   sf_data: A simple features object.
    #   description: A description of the data frame output.
    #
    # Returns:
    #   A data frame with "description" as the first column, and following,
    #   the number of features, mean area of the "sf" object, standard deviation
    #   of the area, and total area.

    features_area <- sf::st_area(sf_data) %>%
        set_units("km^2") %>%
        drop_units()

    output <- data.frame(
        DESCRIPTION = description,
        NUM_FEATURES = count(sf_data),
        MEAN_AREA = mean(features_area),
        STANDARD_DEV = sd(features_area),
        TOTAL_AREA = sum(features_area)
    )[-3]

    colnames(output)[2] <- "NUM_FEATURES" # Remove ".n" from col name

    return(output)
}

point_in_polygon <- function(points, polygon, compare) {
    # Performs point-in-polygon computation via a given column.
    #
    # Args:
    #   points: A spatial dataset of points.
    #   polygon: A spatial dataset of polygons.
    #   compare: The column from polygons to count the number
    #            of points in.
    #
    # Returns:
    #   A spatial dataset of two columns counting the points in any
    #   given polygon.
    st_join(polygon, points) %>%
        st_drop_geometry() %>%
        count(get(compare)) %>%
        setNames(c(compare, "n")) %>%
        left_join(polygon, by = compare) %>%
        st_as_sf()
}

Function Definitions

For explicit function code, click the above CODE button.

get_conus_spatial()

Filter CONUS from a spatial US dataset and modify its projection.

Args:
    data: Spatial US dataset to filter.
    var: Column of "data" that contains states' information.
    proj: EPSG code to project dataset to, defaults to EPSG:5070.
    abb: If TRUE, then states in "var" are denoted with abbreviations;
         If FALSE, then states in "var" are denoted by their full names.

Returns:
    A spatial CONUS dataset projected to given EPSG code.

generate_tiles()

Generate tiled spatial data from four tessellation/coverage options.

Args:
  data: Spatial dataset to tessellate
  tile_type: "square" - square coverage with "n = 70"
             "hexagonal" - hexagonal coverage with "n = 70"
             "voronoi" - voronoi tesselation
             "triangulate" - Delaunay triangulation
             none of the above - unmodified data

Returns:
  A spatial, tiled dataset from a given dataset and
  tessellation/coverage type or unmodified data.

plot_map()

Plot spatial data using ggplot() + geom_sf().

Args:
  data: Spatial dataset to plot.
  title: Title shown on plot.
  include_features: If TRUE, include caption of features collected.

Returns:
  A plotted ggplot() map with, a white background,
  navy border, size = 0.2, features collected, and given title.

plot_map_pip()

Plot simple features point-in-polygon data using ggplot() + geom_sf().

Args:
  data: simple features point-in-polygon dataset to plot.
  title: Title shown on plot.
  highlight_sd_mean: If TRUE, use gghighlight() to highlight
                     the above mean count of points in a polygon.

Returns:
  A plotted ggplot() map using viridis colour scale.

compute_features()

Compute calculations on a "sf" object.

Args:
  sf_data: A simple features object.
  description: A description of the data frame output.

Returns:
  A data frame with "description" as the first column, and following,
  the number of features, mean area of the "sf" object, standard deviation
  of the area, and total area.

point_in_polygon()

Performs point-in-polygon computation via a given column.

Args:
  points: A spatial dataset of points.
  polygon: A spatial dataset of polygons.
  compare: The column from polygons to count the number
           of points in.

Returns:
  A spatial dataset of two columns counting the points in any
  given polygon.

# Step 1.1
conus_counties <- get_conus_spatial(USAboundaries::us_counties(), "state_name")

# Step 1.2/1.3
conus_square <- generate_tiles(conus_counties, "square")
conus_hexagonal <- generate_tiles(conus_counties, "hexagonal")
conus_voronoi <- generate_tiles(conus_counties, "voronoi")
conus_triangulate <- generate_tiles(conus_counties, "triangulate")

# Testing before Step 1.4/1.5

Checking Plots

Square Coverage

Hexagonal Coverage

Voronoi Tessellation

Delaunay Triangulation


Tidying Tessellations

conus_border <- st_union(conus_counties)
conus_simple <- conus_border %>%
    rmapshaper::ms_simplify(keep = 0.05)

One thing to notice about the data that we are going to perform analysis on, in particular the tessellated spatial datasets, is that we have a lot of surrounding excess data. The goal here with tidying these datasets is to create a suitable dataset for performing analysis on, and in this case, that means specifically CONUS representative data. The first thing we will do is to create a simplified border of CONUS:

ggplot() +
    geom_sf(data = conus_border) +
    theme_void() +
        theme_void() +
    labs(subtitle = paste(
                        "Unioned CONUS Points: ",
                        mapview::npts(conus_border)
                    )
        )

ggplot() +
    geom_sf(data = conus_simple) +
    theme_void() +
    labs(subtitle = paste(
                        "Simplified CONUS Points: ",
                        mapview::npts(conus_simple)
                    )
        )

By simplifying we were able to remove 3068 points, while still keeping roughly the same shape that we want. While it may seem that we do not want to lose data, for the analysis that we will be doing, a great amount of detail isn’t necessary. Here, we want to visualize the overlap of our simplified CONUS border over our tessellations, so that we can see exactly what are trying to get rid of Note that, while the overlapping CONUS border looks like it is a MULTILINESTRING, it’s really a geom_sf() with fill = NA. Now, notice the border excludes most of the excess data surrounding the tessellated spatial datasets that we do not need:

# Overlapping Plot with CONUS map to cut edges
ggplot() +
    geom_sf(data = conus_voronoi, fill = NA) +
    geom_sf(
        data = conus_simple,
        color = "green",
        fill = NA) +
        theme_void() +
        labs(subtitle = paste(
                            "Simplified CONUS Points: ",
                            mapview::npts(conus_simple),
                            "\n",
                            "Voronoi Tessellation Points: ",
                            mapview::npts(conus_voronoi)
                       )
            )

# Overlapping Plot with CONUS map to cut edges
ggplot() +
    geom_sf(data = conus_triangulate, fill = NA) +
    geom_sf(
        data = conus_simple,
        color = "green",
        fill = NA) +
        theme_void() +
        labs(subtitle = paste(
                            "Simplified CONUS Points: ",
                            mapview::npts(conus_simple),
                            "\n",
                            "Delaunay Triangulation Points: ",
                            mapview::npts(conus_triangulate)
                       )
            )

Using sf::st_intersection(), we can get rid of the excess data. Using this function, we have the following spatial datasets:

# Plot tidied voronoi
conus_voronoi_tidy <- st_intersection(conus_voronoi, conus_simple)
plot_map(
    conus_voronoi_tidy,
    title = "Tidied Voronoi Tessellation",
    include_features = FALSE
)

# Plot tidied triangulation
conus_triangulate_tidy <- st_intersection(conus_triangulate, conus_simple)
plot_map(
    conus_triangulate_tidy,
    title = "Tidied Delaunay Triangulation",
    include_features = FALSE
)

So, this gives us each spatial dataset that we will be using:

Spatial Datasets

Original

Square Coverage

Hexagonal Coverage

voronoi Tessellation

Delaunay Triangulation

Features Computations

feature_summary <- bind_rows(
    compute_features(conus_counties, "US Counties"),
    compute_features(conus_square, "Square Coverage"),
    compute_features(conus_hexagonal, "Hexagonal Coverage"),
    compute_features(conus_voronoi_tidy, "Voronoi Tessellation"),
    compute_features(conus_triangulate_tidy, "Delaunay Triangulation")
)

knitr::kable(
    feature_summary,
    col.names = c(
        "Description",
        "Number of Features",
        "Area Mean",
        "Area Standard Deviation",
        "Total Area"
    ),
    caption = "Tessellation Features Data"
) %>%
    kableExtra::kable_styling(
        bootstrap_options = c("striped", "hover"),
        full_width = F,
        position = "center",
    ) %>%
    kableExtra::column_spec(
        2,
        color = kableExtra::spec_color(feature_summary$NUM_FEATURES, end = 0.75, option = "C", direction = -1)
    ) %>%
    kableExtra::column_spec(
        5,
        color = kableExtra::spec_color(feature_summary$TOTAL_AREA, end = 0.75, option = "C", direction = -1)
    )
Tessellation Features Data
Description Number of Features Area Mean Area Standard Deviation Total Area
US Counties 3108 2521.745 3404.326 7837583
Square Coverage 3108 2728.125 0.000 8479013
Hexagonal Coverage 2271 3763.052 0.000 8545892
Voronoi Tessellation 3106 2525.425 2886.661 7843970
Delaunay Triangulation 6195 1253.691 1578.647 7766614

When we analyze these tessellations, some important information needs to be taken into account. From the Tessellation Features Data table, we can observe a few different attributes, such as: number of features, mean of feature areas, standard deviation of feature areas, and the total area.

The importance of this data relates to the modifiable areal unit problem and inherent statistical bias; for example, consider the original dataset versus the other tessellations. When the original dataset is tessellated, there is a potential loss of accuracy with point-in-polygon analysis, as points may potentially fall into differing areas with the tessellated datasets. However, the tradeoffs for accuracy come with computation.

Take hexagonal coverage versus Delaunay triangulation for example. Since hexagonal coverage has a significantly smaller number of features than Delaunay triangulation, the efficiency of computing point-in-polygon analysis would be much faster. Although, depending on the time complexity of the method used for computing the analysis, this may not be as much of an issue (i.e. \(O(n)\) versus \(O(\log(n))\)).

us_dams <- readxl::read_excel("data/NID2019_U.xlsx") %>%
    filter(!is.na(LONGITUDE), !is.na(LATITUDE)) %>%
    st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs = 4326) %>%
    st_transform(5070)

pip_counties <- point_in_polygon(us_dams, conus_counties, "geoid")
pip_square <- point_in_polygon(us_dams, conus_square, "id")
pip_hexagonal <- point_in_polygon(us_dams, conus_hexagonal, "id")
pip_voronoi <- point_in_polygon(us_dams, conus_voronoi_tidy, "id")
pip_delaunay <- point_in_polygon(us_dams, conus_triangulate_tidy, "id")

Point-in-Polygon Analysis of Tessellations

Original

Square Coverage

Hexagonal Coverage

Voronoi Tessellation

Delaunay Triangulation

Moving forward, we will focus on the square coverage, as this provides an accurate, non-biased visualization of dam locations. Primarily, it removes the concept of county lines – which are susceptible to statistical bias (MUAP) – and allows us to focus on the more specific locations of dams in relation to the country as a whole (as opposed to in relation to counties).

To directly see the difference in bias, compare the locations in the original and voronoi datasets to the square and hexagonal.

# Flood Control
us_dams_c <- us_dams %>%
    filter(grepl("C", us_dams$PURPOSES))

# Fire Protection
us_dams_p <- us_dams %>%
    filter(grepl("P", us_dams$PURPOSES))

# Water Supply
us_dams_s <- us_dams %>%
    filter(grepl("S", us_dams$PURPOSES))

# Hydroelectric
us_dams_h <- us_dams %>%
    filter(grepl("H", us_dams$PURPOSES))

pip_square_c <- point_in_polygon(us_dams_c, conus_square, "id")
pip_square_p <- point_in_polygon(us_dams_p, conus_square, "id")
pip_square_s <- point_in_polygon(us_dams_s, conus_square, "id")
pip_square_h <- point_in_polygon(us_dams_h, conus_square, "id")

Visualizing Dam Purposes across the US

Flood Control (C)

Fire Protection (P)

Water Supply (S)

Hydroelectric (H)

Using a square tessellation allows for a greater accuracy of analyzing geographic distribution of dams in relation to environmental concerns, such as river systems and climate.

For example, when looking at dams for Water Supply (S), the greater average of dams are located in areas that require a higher need for water, either as a result from drought or irrigation (i.e. California).

Then, when thinking in terms of river systems, we can analyze the flood-risk for dams that intersect with the Mississippi river system. See below for an example.

Analyzing Flood-risk Potential Dams

max_storage_states <- us_dams %>%
    get_conus_spatial("STATE", abb = TRUE) %>%
    select(RECORDID, DAM_NAME, COUNTY, HAZARD, NID_STORAGE, STATE, geometry) %>%
    filter(HAZARD == "H") %>%
    group_by(STATE) %>%
    filter(NID_STORAGE == max(NID_STORAGE)) %>%
    slice(1) %>%
    ungroup() %>%
    st_join(us_dams, by = "DAM_NAME", left = TRUE, join = st_equals) %>%
    st_transform("+proj=longlat +datum=WGS84")

river_systems <- read_sf("data/MajorRivers/MajorRivers.shp")
m_river_system <- filter(river_systems, SYSTEM == "Mississippi")

leaflet(max_storage_states) %>%
    addTiles() %>%
    addCircleMarkers(
        radius = (~NID_STORAGE.x / 1500000),
        color = "red",
        stroke = FALSE,
        fillOpacity = 1,
        clusterOptions = markerClusterOptions(),
        popup = leafpop::popupTable(
            setNames(
                st_drop_geometry(
                    select(
                        max_storage_states,
                        DAM_NAME.x,
                        NID_STORAGE.x,
                        PURPOSES,
                        YEAR_COMPLETED
                    )
                ),
                c("Dam Name", "Storage", "Purposes", "Year Completed")
            ),
            row.numbers = FALSE,
            feature.id = FALSE
        )
    ) %>%
    addPolylines(
        data = m_river_system,
        color = "#294adb",
        weight = 1.5,
        smoothFactor = 0.5
    )